 % NOTE(review): no volume/number is recorded and pages 1--13 may be an online-first placeholder; confirm against the publisher record.
 @article{kuppusamy2021convolutional,
   author    = {Kuppusamy, Karthika and Eswaran, Chandra},
   title     = {Convolutional and Deep Neural Networks based techniques for extracting the age-relevant features of the speaker},
   journal   = {Journal of Ambient Intelligence and Humanized Computing},
   pages     = {1--13},
   year      = {2021},
   publisher = {Springer},
 }

 @article{kersta1962voiceprint,
  author    = {Kersta, Lawrence George},
  title     = {Voiceprint identification},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {34},
  number    = {5},
  pages     = {725},
  year      = {1962},
  publisher = {Acoustical Society of America},
}

@article{pruzansky1963pattern,
  author    = {Pruzansky, Sandra},
  title     = {Pattern-Matching Procedure for Automatic Talker Recognition},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {35},
  number    = {3},
  pages     = {354--358},
  year      = {1963},
  publisher = {Acoustical Society of America},
}

@article{atal1971speech,
  author    = {Atal, Bishnu S and Hanauer, Suzanne L},
  title     = {Speech analysis and synthesis by linear prediction of the speech wave},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {50},
  number    = {2B},
  pages     = {637--655},
  year      = {1971},
  publisher = {Acoustical Society of America},
}

@article{atal1974effectiveness,
  author    = {Atal, Bishnu S},
  title     = {Effectiveness of linear prediction characteristics of the speech wave for automatic speaker identification and verification},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {55},
  number    = {6},
  pages     = {1304--1312},
  year      = {1974},
  publisher = {Acoustical Society of America},
}

@article{sakoe1978dynamic,
  author    = {Sakoe, Hiroaki and Chiba, Seibi},
  title     = {Dynamic programming algorithm optimization for spoken word recognition},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {26},
  number    = {1},
  pages     = {43--49},
  year      = {1978},
  publisher = {IEEE},
}

@inproceedings{burton1983generalization,
  author       = {Burton, D and Shore, J and Buck, J},
  title        = {A generalization of isolated word recognition using vector quantization},
  booktitle    = {ICASSP'83. IEEE International Conference on Acoustics, Speech, and Signal Processing},
  volume       = {8},
  pages        = {1021--1024},
  year         = {1983},
  organization = {IEEE},
}

@article{rabiner1986introduction,
  author    = {Rabiner, Lawrence and Juang, Biing-Hwang},
  title     = {An introduction to hidden {Markov} models},
  journal   = {IEEE ASSP Magazine},
  volume    = {3},
  number    = {1},
  pages     = {4--16},
  year      = {1986},
  publisher = {IEEE},
}

@article{jain1996artificial,
  author    = {Jain, Anil K and Mao, Jianchang and Mohiuddin, K Moidin},
  title     = {Artificial neural networks: A tutorial},
  journal   = {Computer},
  volume    = {29},
  number    = {3},
  pages     = {31--44},
  year      = {1996},
  publisher = {IEEE},
}

@article{reynolds1995robust,
  author    = {Reynolds, Douglas A and Rose, Richard C},
  title     = {Robust text-independent speaker identification using {Gaussian} mixture speaker models},
  journal   = {IEEE Transactions on Speech and Audio Processing},
  volume    = {3},
  number    = {1},
  pages     = {72--83},
  year      = {1995},
  publisher = {IEEE},
}

@article{campbell2006support,
  author    = {Campbell, William M and Sturim, Douglas E and Reynolds, Douglas A},
  title     = {Support vector machines using {GMM} supervectors for speaker verification},
  journal   = {IEEE Signal Processing Letters},
  volume    = {13},
  number    = {5},
  pages     = {308--311},
  year      = {2006},
  publisher = {IEEE},
}

@inproceedings{variani2014deep,
  author       = {Variani, Ehsan and Lei, Xin and McDermott, Erik and Moreno, Ignacio Lopez and Gonzalez-Dominguez, Javier},
  title        = {Deep neural networks for small footprint text-dependent speaker verification},
  booktitle    = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4052--4056},
  year         = {2014},
  organization = {IEEE},
}

% NOTE(review): the original article-type entry had no venue; recorded here as the Interspeech 2015 paper. Confirm and add page numbers.
@inproceedings{chen2015locally,
  author    = {Chen, Yu-hsin and Moreno, Ignacio Lopez and Sainath, Tara and Visontai, Mirk{\'o} and Alvarez, Raziel and Parada, Carolina},
  title     = {Locally-connected and convolutional neural networks for small footprint speaker recognition},
  booktitle = {Interspeech},
  year      = {2015},
}

@article{reynolds2000speaker,
  author    = {Reynolds, Douglas A and Quatieri, Thomas F and Dunn, Robert B},
  title     = {Speaker verification using adapted {Gaussian} mixture models},
  journal   = {Digital Signal Processing},
  volume    = {10},
  number    = {1--3},
  pages     = {19--41},
  year      = {2000},
  publisher = {Elsevier},
}

@inproceedings{heigold2016end,
  author       = {Heigold, Georg and Moreno, Ignacio and Bengio, Samy and Shazeer, Noam},
  title        = {End-to-end text-dependent speaker verification},
  booktitle    = {2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5115--5119},
  year         = {2016},
  organization = {IEEE},
}

@inproceedings{snyder2017deep,
  author    = {Snyder, David and Garcia-Romero, Daniel and Povey, Daniel and Khudanpur, Sanjeev},
  title     = {Deep Neural Network Embeddings for Text-Independent Speaker Verification},
  booktitle = {Interspeech},
  pages     = {999--1003},
  year      = {2017},
}

@article{li2017deep,
  author  = {Li, Chao and Ma, Xiaokong and Jiang, Bing and Li, Xiangang and Zhang, Xuewei and Liu, Xiao and Cao, Ying and Kannan, Ajay and Zhu, Zhenyao},
  title   = {Deep {Speaker}: an end-to-end neural speaker embedding system},
  journal = {arXiv preprint arXiv:1705.02304},
  year    = {2017},
}

@inproceedings{kenny2013plda,
  author       = {Kenny, Patrick and Stafylakis, Themos and Ouellet, Pierre and Alam, Md Jahangir and Dumouchel, Pierre},
  title        = {{PLDA} for speaker verification with utterances of arbitrary duration},
  booktitle    = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing},
  pages        = {7649--7653},
  year         = {2013},
  organization = {IEEE},
}

@inproceedings{prince2007probabilistic,
  author       = {Prince, Simon J. D. and Elder, James H},
  title        = {Probabilistic linear discriminant analysis for inferences about identity},
  booktitle    = {2007 IEEE 11th International Conference on Computer Vision},
  pages        = {1--8},
  year         = {2007},
  organization = {IEEE},
}

@inproceedings{snyder2018x,
  author       = {Snyder, David and Garcia-Romero, Daniel and Sell, Gregory and Povey, Daniel and Khudanpur, Sanjeev},
  title        = {X-vectors: Robust {DNN} embeddings for speaker recognition},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5329--5333},
  year         = {2018},
  organization = {IEEE},
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {770--778},
  year      = {2016},
}

@article{desplanques2020ecapa,
  author  = {Desplanques, Brecht and Thienpondt, Jenthe and Demuynck, Kris},
  title   = {{ECAPA-TDNN}: Emphasized channel attention, propagation and aggregation in {TDNN} based speaker verification},
  journal = {arXiv preprint arXiv:2005.07143},
  year    = {2020},
}

@article{kinnunen2010overview,
  author    = {Kinnunen, Tomi and Li, Haizhou},
  title     = {An overview of text-independent speaker recognition: From features to supervectors},
  journal   = {Speech Communication},
  volume    = {52},
  number    = {1},
  pages     = {12--40},
  year      = {2010},
  publisher = {Elsevier},
}

@article{gardner1998artificial,
  author    = {Gardner, Matt W and Dorling, S. R.},
  title     = {Artificial neural networks (the multilayer perceptron)---a review of applications in the atmospheric sciences},
  journal   = {Atmospheric Environment},
  volume    = {32},
  number    = {14--15},
  pages     = {2627--2636},
  year      = {1998},
  publisher = {Elsevier},
}

@incollection{bengio2012practical,
  author    = {Bengio, Yoshua},
  title     = {Practical recommendations for gradient-based training of deep architectures},
  booktitle = {Neural Networks: Tricks of the Trade},
  pages     = {437--478},
  year      = {2012},
  publisher = {Springer},
}

@article{hornik1989multilayer,
  author    = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
  title     = {Multilayer feedforward networks are universal approximators},
  journal   = {Neural Networks},
  volume    = {2},
  number    = {5},
  pages     = {359--366},
  year      = {1989},
  publisher = {Elsevier},
}

@inproceedings{peddinti2015time,
  author    = {Peddinti, Vijayaditya and Povey, Daniel and Khudanpur, Sanjeev},
  title     = {A time delay neural network architecture for efficient modeling of long temporal contexts},
  booktitle = {Sixteenth Annual Conference of the International Speech Communication Association},
  year      = {2015},
}

@inproceedings{hu2018squeeze,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-excitation networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {7132--7141},
  year      = {2018},
}

% NOTE(review): volume 43, number 2 may correspond to a print issue dated later than 2019; confirm which year should be cited.
@article{gao2019res2net,
  author    = {Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip},
  title     = {{Res2Net}: A new multi-scale backbone architecture},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume    = {43},
  number    = {2},
  pages     = {652--662},
  year      = {2019},
  publisher = {IEEE},
}

@article{okabe2018attentive,
  author  = {Okabe, Koji and Koshinaka, Takafumi and Shinoda, Koichi},
  title   = {Attentive statistics pooling for deep speaker embedding},
  journal = {arXiv preprint arXiv:1803.10963},
  year    = {2018},
}

@article{wang2018additive,
  author    = {Wang, Feng and Cheng, Jian and Liu, Weiyang and Liu, Haijun},
  title     = {Additive margin softmax for face verification},
  journal   = {IEEE Signal Processing Letters},
  volume    = {25},
  number    = {7},
  pages     = {926--930},
  year      = {2018},
  publisher = {IEEE},
}

@inproceedings{deng2019arcface,
  author    = {Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos},
  title     = {{ArcFace}: Additive angular margin loss for deep face recognition},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {4690--4699},
  year      = {2019},
}

@article{india2019self,
  author  = {India, Miquel and Safari, Pooyan and Hernando, Javier},
  title   = {Self multi-head attention for speaker recognition},
  journal = {arXiv preprint arXiv:1906.09890},
  year    = {2019},
}

@inproceedings{yamamoto2019speaker,
  author    = {Yamamoto, Hitoshi and Lee, Kong Aik and Okabe, Koji and Koshinaka, Takafumi},
  title     = {Speaker Augmentation and Bandwidth Extension for Deep Speaker Embedding},
  booktitle = {Interspeech},
  pages     = {406--410},
  year      = {2019},
}

@inproceedings{ko2015audio,
  author    = {Ko, Tom and Peddinti, Vijayaditya and Povey, Daniel and Khudanpur, Sanjeev},
  title     = {Audio augmentation for speech recognition},
  booktitle = {Sixteenth Annual Conference of the International Speech Communication Association},
  year      = {2015},
}

@article{snyder2015musan,
  author  = {Snyder, David and Chen, Guoguo and Povey, Daniel},
  title   = {{MUSAN}: A music, speech, and noise corpus},
  journal = {arXiv preprint arXiv:1510.08484},
  year    = {2015},
}

@article{park2019specaugment,
  author  = {Park, Daniel S and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D and Le, Quoc V},
  title   = {{SpecAugment}: A simple data augmentation method for automatic speech recognition},
  journal = {arXiv preprint arXiv:1904.08779},
  year    = {2019},
}

@inproceedings{wang2020investigation,
  author       = {Wang, Shuai and Rohdin, Johan and Plchot, Old{\v{r}}ich and Burget, Luk{\'a}{\v{s}} and Yu, Kai and {\v{C}}ernock{\'y}, Jan},
  title        = {Investigation of {SpecAugment} for deep speaker embedding learning},
  booktitle    = {ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {7139--7143},
  year         = {2020},
  organization = {IEEE},
}

@article{nagrani2017voxceleb,
  author  = {Nagrani, Arsha and Chung, Joon Son and Zisserman, Andrew},
  title   = {{VoxCeleb}: a large-scale speaker identification dataset},
  journal = {arXiv preprint arXiv:1706.08612},
  year    = {2017},
}

@article{nagrani2020voxceleb,
  author    = {Nagrani, Arsha and Chung, Joon Son and Xie, Weidi and Zisserman, Andrew},
  title     = {{VoxCeleb}: Large-scale speaker verification in the wild},
  journal   = {Computer Speech \& Language},
  volume    = {60},
  pages     = {101027},
  year      = {2020},
  publisher = {Elsevier},
}

@inproceedings{povey2011kaldi,
  author       = {Povey, Daniel and Ghoshal, Arnab and Boulianne, Gilles and Burget, Luk{\'a}{\v{s}} and Glembek, Ond{\v{r}}ej and Goel, Nagendra and Hannemann, Mirko and Motl{\'\i}{\v{c}}ek, Petr and Qian, Yanmin and Schwarz, Petr and others},
  title        = {The {Kaldi} speech recognition toolkit},
  booktitle    = {IEEE 2011 Workshop on Automatic Speech Recognition and Understanding},
  year         = {2011},
  organization = {IEEE Signal Processing Society},
}

@article{ravanelli2021speechbrain,
  author  = {Ravanelli, Mirco and Parcollet, Titouan and Plantinga, Peter and Rouhe, Aku and Cornell, Samuele and Lugosch, Loren and Subakan, Cem and Dawalatabad, Nauman and Heba, Abdelwahab and Zhong, Jianyuan and others},
  title   = {{SpeechBrain}: A general-purpose speech toolkit},
  journal = {arXiv preprint arXiv:2106.04624},
  year    = {2021},
}

@article{paszke2019pytorch,
  author  = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
  title   = {{PyTorch}: An imperative style, high-performance deep learning library},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {32},
  year    = {2019},
}

@inproceedings{matejka2017analysis,
  author    = {Mat{\v{e}}jka, Pavel and Novotn{\'y}, Ond{\v{r}}ej and Plchot, Old{\v{r}}ich and Burget, Luk{\'a}{\v{s}} and S{\'a}nchez, Mireia Diez and {\v{C}}ernock{\'y}, Jan},
  title     = {Analysis of Score Normalization in Multilingual Speaker Recognition},
  booktitle = {Interspeech},
  pages     = {1567--1571},
  year      = {2017},
}

@inproceedings{fan2020cn,
  author       = {Fan, Yue and Kang, J. W. and Li, L. T. and Li, K. C. and Chen, H. L. and Cheng, S. T. and Zhang, P. Y. and Zhou, Z. Y. and Cai, Y. Q. and Wang, Dong},
  title        = {{CN-Celeb}: a challenging {Chinese} speaker recognition dataset},
  booktitle    = {ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {7604--7608},
  year         = {2020},
  organization = {IEEE},
}

% NOTE(review): volume and pages are missing from this entry; add them from the publisher record.
@article{li2022cn,
  author    = {Li, Lantian and Liu, Ruiqi and Kang, Jiawen and Fan, Yue and Cui, Hao and Cai, Yunqi and Vipperla, Ravichander and Zheng, Thomas Fang and Wang, Dong},
  title     = {{CN-Celeb}: multi-genre speaker recognition},
  journal   = {Speech Communication},
  year      = {2022},
  publisher = {Elsevier},
}

@inproceedings{ren2018learning,
  author       = {Ren, Mengye and Zeng, Wenyuan and Yang, Bin and Urtasun, Raquel},
  title        = {Learning to reweight examples for robust deep learning},
  booktitle    = {International Conference on Machine Learning},
  pages        = {4334--4343},
  year         = {2018},
  organization = {PMLR},
}

@article{kim2021imbalanced,
  author    = {Kim, Yechan and Lee, Younkwan and Jeon, Moongu},
  title     = {Imbalanced image classification with complement cross entropy},
  journal   = {Pattern Recognition Letters},
  volume    = {151},
  pages     = {33--40},
  year      = {2021},
  publisher = {Elsevier},
}

@article{bachman2019learning,
  author  = {Bachman, Philip and Hjelm, R Devon and Buchwalter, William},
  title   = {Learning representations by maximizing mutual information across views},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {32},
  year    = {2019},
}

@inproceedings{chen2020simple,
  author       = {Chen, Ting and Kornblith, Simon and Norouzi, Mohammad and Hinton, Geoffrey},
  title        = {A simple framework for contrastive learning of visual representations},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1597--1607},
  year         = {2020},
  organization = {PMLR},
}

@inproceedings{chen2017sampling,
  author    = {Chen, Ting and Sun, Yizhou and Shi, Yue and Hong, Liangjie},
  title     = {On sampling strategies for neural network-based collaborative filtering},
  booktitle = {Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  pages     = {767--776},
  year      = {2017},
}

@article{zhang2022mfa,
  author  = {Zhang, Yang and Lv, Zhiqiang and Wu, Haibin and Zhang, Shanshan and Hu, Pengfei and Wu, Zhiyong and Lee, Hung-yi and Meng, Helen},
  title   = {{MFA-Conformer}: Multi-scale Feature Aggregation Conformer for Automatic Speaker Verification},
  journal = {arXiv preprint arXiv:2203.15249},
  year    = {2022},
}

@article{vaswani2017attention,
  author  = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title   = {Attention is all you need},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {30},
  year    = {2017},
}

@article{dai2021coatnet,
  author  = {Dai, Zihang and Liu, Hanxiao and Le, Quoc V and Tan, Mingxing},
  title   = {{CoAtNet}: Marrying convolution and attention for all data sizes},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {3965--3977},
  year    = {2021},
}

@article{wortsman2022model,
  author  = {Wortsman, Mitchell and Ilharco, Gabriel and Gadre, Samir Yitzhak and Roelofs, Rebecca and Gontijo-Lopes, Raphael and Morcos, Ari S and Namkoong, Hongseok and Farhadi, Ali and Carmon, Yair and Kornblith, Simon and others},
  title   = {Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time},
  journal = {arXiv preprint arXiv:2203.05482},
  year    = {2022},
}

@article{thienpondt2021integrating,
  author  = {Thienpondt, Jenthe and Desplanques, Brecht and Demuynck, Kris},
  title   = {Integrating frequency translational invariance in {TDNNs} and frequency positional information in {2D} {ResNets} to enhance speaker verification},
  journal = {arXiv preprint arXiv:2104.02370},
  year    = {2021},
}

@article{zhang2021multi,
  author  = {Zhang, Li and Wang, Qing and Lee, Kong Aik and Xie, Lei and Li, Haizhou},
  title   = {Multi-level transfer learning from near-field to far-field speaker verification},
  journal = {arXiv preprint arXiv:2106.09320},
  year    = {2021},
}